In [1]:
%matplotlib inline

import os

import pickle

import numpy as np
import pandas as pd

from Bio import SeqIO

from kindel import kindel
In [2]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import seaborn as sns
In [3]:
import plotly.offline as py
import plotly.graph_objs as go
import plotly.figure_factory as ff

import cufflinks as cf

cf.go_offline()
In [44]:
from scipy.stats import wilcoxon
from scipy.stats import mannwhitneyu
In [4]:
reads_path = '/Users/Bede/Research/Datasets/phe_hcv_tf/reads_int/'
bam_path = '/Users/Bede/Research/Analyses/phe_hcv_tf_kindel/cns_mappings/'
bam_sub_path = '/Users/Bede/Research/Analyses/phe_hcv_tf_kindel/cns_mappings_sub/'

Needed first time around

In [5]:
reads_fns = [fn for fn in os.listdir(reads_path) if fn.endswith('.gz')]

ids_reads = {}
for fn in reads_fns:
    id = fn.replace('.R12.fastq.gz','').replace('HCV_AVU_AB_','')
    ids_reads[id] = reads_path + fn
ids_reads
Out[5]:
{'1.1': '/Users/Bede/Research/Datasets/phe_hcv_tf/reads_int/HCV_AVU_AB_1.1.R12.fastq.gz',
 '1.2': '/Users/Bede/Research/Datasets/phe_hcv_tf/reads_int/HCV_AVU_AB_1.2.R12.fastq.gz',
 '1.3': '/Users/Bede/Research/Datasets/phe_hcv_tf/reads_int/HCV_AVU_AB_1.3.R12.fastq.gz',
 '1.4': '/Users/Bede/Research/Datasets/phe_hcv_tf/reads_int/HCV_AVU_AB_1.4.R12.fastq.gz',
 '1.5': '/Users/Bede/Research/Datasets/phe_hcv_tf/reads_int/HCV_AVU_AB_1.5.R12.fastq.gz',
 '10.1': '/Users/Bede/Research/Datasets/phe_hcv_tf/reads_int/HCV_AVU_AB_10.1.R12.fastq.gz',
 '10.2': '/Users/Bede/Research/Datasets/phe_hcv_tf/reads_int/HCV_AVU_AB_10.2.R12.fastq.gz',
 '11.1': '/Users/Bede/Research/Datasets/phe_hcv_tf/reads_int/HCV_AVU_AB_11.1.R12.fastq.gz',
 '11.2': '/Users/Bede/Research/Datasets/phe_hcv_tf/reads_int/HCV_AVU_AB_11.2.R12.fastq.gz',
 '12.1': '/Users/Bede/Research/Datasets/phe_hcv_tf/reads_int/HCV_AVU_AB_12.1.R12.fastq.gz',
 '12.2': '/Users/Bede/Research/Datasets/phe_hcv_tf/reads_int/HCV_AVU_AB_12.2.R12.fastq.gz',
 '13.1': '/Users/Bede/Research/Datasets/phe_hcv_tf/reads_int/HCV_AVU_AB_13.1.R12.fastq.gz',
 '13.2': '/Users/Bede/Research/Datasets/phe_hcv_tf/reads_int/HCV_AVU_AB_13.2.R12.fastq.gz',
 '14.1': '/Users/Bede/Research/Datasets/phe_hcv_tf/reads_int/HCV_AVU_AB_14.1.R12.fastq.gz',
 '14.2': '/Users/Bede/Research/Datasets/phe_hcv_tf/reads_int/HCV_AVU_AB_14.2.R12.fastq.gz',
 '2.1': '/Users/Bede/Research/Datasets/phe_hcv_tf/reads_int/HCV_AVU_AB_2.1.R12.fastq.gz',
 '2.2': '/Users/Bede/Research/Datasets/phe_hcv_tf/reads_int/HCV_AVU_AB_2.2.R12.fastq.gz',
 '2.3': '/Users/Bede/Research/Datasets/phe_hcv_tf/reads_int/HCV_AVU_AB_2.3.R12.fastq.gz',
 '2.4': '/Users/Bede/Research/Datasets/phe_hcv_tf/reads_int/HCV_AVU_AB_2.4.R12.fastq.gz',
 '3.1': '/Users/Bede/Research/Datasets/phe_hcv_tf/reads_int/HCV_AVU_AB_3.1.R12.fastq.gz',
 '3.2': '/Users/Bede/Research/Datasets/phe_hcv_tf/reads_int/HCV_AVU_AB_3.2.R12.fastq.gz',
 '3.3': '/Users/Bede/Research/Datasets/phe_hcv_tf/reads_int/HCV_AVU_AB_3.3.R12.fastq.gz',
 '3.4': '/Users/Bede/Research/Datasets/phe_hcv_tf/reads_int/HCV_AVU_AB_3.4.R12.fastq.gz',
 '3.5': '/Users/Bede/Research/Datasets/phe_hcv_tf/reads_int/HCV_AVU_AB_3.5.R12.fastq.gz',
 '4.1': '/Users/Bede/Research/Datasets/phe_hcv_tf/reads_int/HCV_AVU_AB_4.1.R12.fastq.gz',
 '4.2': '/Users/Bede/Research/Datasets/phe_hcv_tf/reads_int/HCV_AVU_AB_4.2.R12.fastq.gz',
 '5.1': '/Users/Bede/Research/Datasets/phe_hcv_tf/reads_int/HCV_AVU_AB_5.1.R12.fastq.gz',
 '5.2': '/Users/Bede/Research/Datasets/phe_hcv_tf/reads_int/HCV_AVU_AB_5.2.R12.fastq.gz',
 '6.1': '/Users/Bede/Research/Datasets/phe_hcv_tf/reads_int/HCV_AVU_AB_6.1.R12.fastq.gz',
 '6.2': '/Users/Bede/Research/Datasets/phe_hcv_tf/reads_int/HCV_AVU_AB_6.2.R12.fastq.gz',
 '7.1': '/Users/Bede/Research/Datasets/phe_hcv_tf/reads_int/HCV_AVU_AB_7.1.R12.fastq.gz',
 '7.2': '/Users/Bede/Research/Datasets/phe_hcv_tf/reads_int/HCV_AVU_AB_7.2.R12.fastq.gz',
 '8.1': '/Users/Bede/Research/Datasets/phe_hcv_tf/reads_int/HCV_AVU_AB_8.1.R12.fastq.gz',
 '8.2': '/Users/Bede/Research/Datasets/phe_hcv_tf/reads_int/HCV_AVU_AB_8.2.R12.fastq.gz',
 '8.3': '/Users/Bede/Research/Datasets/phe_hcv_tf/reads_int/HCV_AVU_AB_8.3.R12.fastq.gz',
 '9.1': '/Users/Bede/Research/Datasets/phe_hcv_tf/reads_int/HCV_AVU_AB_9.1.R12.fastq.gz',
 '9.2': '/Users/Bede/Research/Datasets/phe_hcv_tf/reads_int/HCV_AVU_AB_9.2.R12.fastq.gz'}
In [6]:
bams_fns = [fn for fn in os.listdir(bam_path) if fn.endswith('.bam')]

ids_bams = {}
for fn in bams_fns:
    id = fn.replace('.bam','')
    ids_bams[id] = bam_path + fn
ids_bams
Out[6]:
{'1.1': '/Users/Bede/Research/Analyses/phe_hcv_tf_kindel/cns_mappings/1.1.bam',
 '1.2': '/Users/Bede/Research/Analyses/phe_hcv_tf_kindel/cns_mappings/1.2.bam',
 '1.3': '/Users/Bede/Research/Analyses/phe_hcv_tf_kindel/cns_mappings/1.3.bam',
 '1.4': '/Users/Bede/Research/Analyses/phe_hcv_tf_kindel/cns_mappings/1.4.bam',
 '1.5': '/Users/Bede/Research/Analyses/phe_hcv_tf_kindel/cns_mappings/1.5.bam',
 '10.1': '/Users/Bede/Research/Analyses/phe_hcv_tf_kindel/cns_mappings/10.1.bam',
 '10.2': '/Users/Bede/Research/Analyses/phe_hcv_tf_kindel/cns_mappings/10.2.bam',
 '11.1': '/Users/Bede/Research/Analyses/phe_hcv_tf_kindel/cns_mappings/11.1.bam',
 '11.2': '/Users/Bede/Research/Analyses/phe_hcv_tf_kindel/cns_mappings/11.2.bam',
 '12.1': '/Users/Bede/Research/Analyses/phe_hcv_tf_kindel/cns_mappings/12.1.bam',
 '12.2': '/Users/Bede/Research/Analyses/phe_hcv_tf_kindel/cns_mappings/12.2.bam',
 '13.1': '/Users/Bede/Research/Analyses/phe_hcv_tf_kindel/cns_mappings/13.1.bam',
 '13.2': '/Users/Bede/Research/Analyses/phe_hcv_tf_kindel/cns_mappings/13.2.bam',
 '14.1': '/Users/Bede/Research/Analyses/phe_hcv_tf_kindel/cns_mappings/14.1.bam',
 '14.2': '/Users/Bede/Research/Analyses/phe_hcv_tf_kindel/cns_mappings/14.2.bam',
 '2.1': '/Users/Bede/Research/Analyses/phe_hcv_tf_kindel/cns_mappings/2.1.bam',
 '2.2': '/Users/Bede/Research/Analyses/phe_hcv_tf_kindel/cns_mappings/2.2.bam',
 '2.3': '/Users/Bede/Research/Analyses/phe_hcv_tf_kindel/cns_mappings/2.3.bam',
 '2.4': '/Users/Bede/Research/Analyses/phe_hcv_tf_kindel/cns_mappings/2.4.bam',
 '3.1': '/Users/Bede/Research/Analyses/phe_hcv_tf_kindel/cns_mappings/3.1.bam',
 '3.2': '/Users/Bede/Research/Analyses/phe_hcv_tf_kindel/cns_mappings/3.2.bam',
 '3.3': '/Users/Bede/Research/Analyses/phe_hcv_tf_kindel/cns_mappings/3.3.bam',
 '3.4': '/Users/Bede/Research/Analyses/phe_hcv_tf_kindel/cns_mappings/3.4.bam',
 '3.5': '/Users/Bede/Research/Analyses/phe_hcv_tf_kindel/cns_mappings/3.5.bam',
 '4.1': '/Users/Bede/Research/Analyses/phe_hcv_tf_kindel/cns_mappings/4.1.bam',
 '4.2': '/Users/Bede/Research/Analyses/phe_hcv_tf_kindel/cns_mappings/4.2.bam',
 '5.1': '/Users/Bede/Research/Analyses/phe_hcv_tf_kindel/cns_mappings/5.1.bam',
 '5.2': '/Users/Bede/Research/Analyses/phe_hcv_tf_kindel/cns_mappings/5.2.bam',
 '6.1': '/Users/Bede/Research/Analyses/phe_hcv_tf_kindel/cns_mappings/6.1.bam',
 '6.2': '/Users/Bede/Research/Analyses/phe_hcv_tf_kindel/cns_mappings/6.2.bam',
 '7.1': '/Users/Bede/Research/Analyses/phe_hcv_tf_kindel/cns_mappings/7.1.bam',
 '7.2': '/Users/Bede/Research/Analyses/phe_hcv_tf_kindel/cns_mappings/7.2.bam',
 '8.1': '/Users/Bede/Research/Analyses/phe_hcv_tf_kindel/cns_mappings/8.1.bam',
 '8.2': '/Users/Bede/Research/Analyses/phe_hcv_tf_kindel/cns_mappings/8.2.bam',
 '8.3': '/Users/Bede/Research/Analyses/phe_hcv_tf_kindel/cns_mappings/8.3.bam',
 '9.1': '/Users/Bede/Research/Analyses/phe_hcv_tf_kindel/cns_mappings/9.1.bam',
 '9.2': '/Users/Bede/Research/Analyses/phe_hcv_tf_kindel/cns_mappings/9.2.bam'}
In [13]:
bams_sub_fns = [fn for fn in os.listdir(bam_sub_path) if fn.endswith('.bam')]

ids_bams_sub = {}
for fn in bams_sub_fns:
    id = fn.replace('.100k.bam','')
    ids_bams_sub[id] = bam_sub_path + fn
ids_bams_sub
Out[13]:
{'1.1': '/Users/Bede/Research/Analyses/phe_hcv_tf_kindel/cns_mappings_sub/1.1.100k.bam',
 '1.2': '/Users/Bede/Research/Analyses/phe_hcv_tf_kindel/cns_mappings_sub/1.2.100k.bam',
 '1.3': '/Users/Bede/Research/Analyses/phe_hcv_tf_kindel/cns_mappings_sub/1.3.100k.bam',
 '1.4': '/Users/Bede/Research/Analyses/phe_hcv_tf_kindel/cns_mappings_sub/1.4.100k.bam',
 '1.5': '/Users/Bede/Research/Analyses/phe_hcv_tf_kindel/cns_mappings_sub/1.5.100k.bam',
 '10.1': '/Users/Bede/Research/Analyses/phe_hcv_tf_kindel/cns_mappings_sub/10.1.100k.bam',
 '10.2': '/Users/Bede/Research/Analyses/phe_hcv_tf_kindel/cns_mappings_sub/10.2.100k.bam',
 '11.1': '/Users/Bede/Research/Analyses/phe_hcv_tf_kindel/cns_mappings_sub/11.1.100k.bam',
 '11.2': '/Users/Bede/Research/Analyses/phe_hcv_tf_kindel/cns_mappings_sub/11.2.100k.bam',
 '12.1': '/Users/Bede/Research/Analyses/phe_hcv_tf_kindel/cns_mappings_sub/12.1.100k.bam',
 '12.2': '/Users/Bede/Research/Analyses/phe_hcv_tf_kindel/cns_mappings_sub/12.2.100k.bam',
 '13.1': '/Users/Bede/Research/Analyses/phe_hcv_tf_kindel/cns_mappings_sub/13.1.100k.bam',
 '13.2': '/Users/Bede/Research/Analyses/phe_hcv_tf_kindel/cns_mappings_sub/13.2.100k.bam',
 '14.1': '/Users/Bede/Research/Analyses/phe_hcv_tf_kindel/cns_mappings_sub/14.1.100k.bam',
 '14.2': '/Users/Bede/Research/Analyses/phe_hcv_tf_kindel/cns_mappings_sub/14.2.100k.bam',
 '2.1': '/Users/Bede/Research/Analyses/phe_hcv_tf_kindel/cns_mappings_sub/2.1.100k.bam',
 '2.2': '/Users/Bede/Research/Analyses/phe_hcv_tf_kindel/cns_mappings_sub/2.2.100k.bam',
 '2.3': '/Users/Bede/Research/Analyses/phe_hcv_tf_kindel/cns_mappings_sub/2.3.100k.bam',
 '2.4': '/Users/Bede/Research/Analyses/phe_hcv_tf_kindel/cns_mappings_sub/2.4.100k.bam',
 '3.1': '/Users/Bede/Research/Analyses/phe_hcv_tf_kindel/cns_mappings_sub/3.1.100k.bam',
 '3.2': '/Users/Bede/Research/Analyses/phe_hcv_tf_kindel/cns_mappings_sub/3.2.100k.bam',
 '3.3': '/Users/Bede/Research/Analyses/phe_hcv_tf_kindel/cns_mappings_sub/3.3.100k.bam',
 '3.4': '/Users/Bede/Research/Analyses/phe_hcv_tf_kindel/cns_mappings_sub/3.4.100k.bam',
 '3.5': '/Users/Bede/Research/Analyses/phe_hcv_tf_kindel/cns_mappings_sub/3.5.100k.bam',
 '4.1': '/Users/Bede/Research/Analyses/phe_hcv_tf_kindel/cns_mappings_sub/4.1.100k.bam',
 '4.2': '/Users/Bede/Research/Analyses/phe_hcv_tf_kindel/cns_mappings_sub/4.2.100k.bam',
 '5.1': '/Users/Bede/Research/Analyses/phe_hcv_tf_kindel/cns_mappings_sub/5.1.100k.bam',
 '5.2': '/Users/Bede/Research/Analyses/phe_hcv_tf_kindel/cns_mappings_sub/5.2.100k.bam',
 '6.1': '/Users/Bede/Research/Analyses/phe_hcv_tf_kindel/cns_mappings_sub/6.1.100k.bam',
 '6.2': '/Users/Bede/Research/Analyses/phe_hcv_tf_kindel/cns_mappings_sub/6.2.100k.bam',
 '7.1': '/Users/Bede/Research/Analyses/phe_hcv_tf_kindel/cns_mappings_sub/7.1.100k.bam',
 '7.2': '/Users/Bede/Research/Analyses/phe_hcv_tf_kindel/cns_mappings_sub/7.2.100k.bam',
 '8.1': '/Users/Bede/Research/Analyses/phe_hcv_tf_kindel/cns_mappings_sub/8.1.100k.bam',
 '8.2': '/Users/Bede/Research/Analyses/phe_hcv_tf_kindel/cns_mappings_sub/8.2.100k.bam',
 '8.3': '/Users/Bede/Research/Analyses/phe_hcv_tf_kindel/cns_mappings_sub/8.3.100k.bam',
 '9.1': '/Users/Bede/Research/Analyses/phe_hcv_tf_kindel/cns_mappings_sub/9.1.100k.bam',
 '9.2': '/Users/Bede/Research/Analyses/phe_hcv_tf_kindel/cns_mappings_sub/9.2.100k.bam'}

Pull records from fasta to individual files

In [19]:
# for id, record in ids_records.items():
#     SeqIO.write(record, '{}.cns.fa'.format(id), 'fasta')
#     print(record.id)
1.1
1.2
1.3
1.4
1.5
2.1
2.2
2.3
2.4
3.1
3.2
3.3
3.4
3.5
4.1
4.2
5.1
5.2
6.1
6.2
7.1
7.2
8.1
8.2
8.3
9.1
9.2
10.1
10.2
11.1
11.2
12.1
12.2
13.1
13.2
14.1
14.2

Map with Segemehl

On cluster

In [30]:
ids = ['1.1','1.2','1.3','1.4','1.5','10.1','10.2','11.1','11.2','12.1','12.2','13.1','13.2','14.1','14.2','2.1','2.2','2.3','2.4','3.1','3.2','3.3','3.4','3.5','4.1','4.2','5.1','5.2','6.1','6.2','7.1','7.2','8.1','8.2','8.3','9.1','9.2']
len(ids)
Out[30]:
37
In [31]:
template = ('~/segemehl/segemehl_0_2_0/segemehl.x'
' -t 6 -A 60 -R 50 --maxout 1 -b'
' -d ~/data/phe_hcv_tf/all_cns_man/{id}.cns.fa'
' -x ~/data/phe_hcv_tf/all_cns_man/{id}.cns.idx'
' -q ~/data/phe_hcv_tf/reads_int/HCV_AVU_AB_{id}.R12.fastq.gz'
' | samtools view -b -'
' | samtools sort -m 4G -o ~/segemehl/phe_hcv_tf/a60_r50_top_cns_man/{id} -')
In [32]:
for id in ids:
    print(template.format(id=id))
~/segemehl/segemehl_0_2_0/segemehl.x -t 6 -A 60 -R 50 --maxout 1 -b -d ~/data/phe_hcv_tf/all_cns_man/1.1.cns.fa -x ~/data/phe_hcv_tf/all_cns_man/1.1.cns.idx -q ~/data/phe_hcv_tf/reads_int/HCV_AVU_AB_1.1.R12.fastq.gz | samtools view -b - | samtools sort -m 4G -o ~/segemehl/phe_hcv_tf/a60_r50_top_cns_man/1.1 -
~/segemehl/segemehl_0_2_0/segemehl.x -t 6 -A 60 -R 50 --maxout 1 -b -d ~/data/phe_hcv_tf/all_cns_man/1.2.cns.fa -x ~/data/phe_hcv_tf/all_cns_man/1.2.cns.idx -q ~/data/phe_hcv_tf/reads_int/HCV_AVU_AB_1.2.R12.fastq.gz | samtools view -b - | samtools sort -m 4G -o ~/segemehl/phe_hcv_tf/a60_r50_top_cns_man/1.2 -
~/segemehl/segemehl_0_2_0/segemehl.x -t 6 -A 60 -R 50 --maxout 1 -b -d ~/data/phe_hcv_tf/all_cns_man/1.3.cns.fa -x ~/data/phe_hcv_tf/all_cns_man/1.3.cns.idx -q ~/data/phe_hcv_tf/reads_int/HCV_AVU_AB_1.3.R12.fastq.gz | samtools view -b - | samtools sort -m 4G -o ~/segemehl/phe_hcv_tf/a60_r50_top_cns_man/1.3 -
~/segemehl/segemehl_0_2_0/segemehl.x -t 6 -A 60 -R 50 --maxout 1 -b -d ~/data/phe_hcv_tf/all_cns_man/1.4.cns.fa -x ~/data/phe_hcv_tf/all_cns_man/1.4.cns.idx -q ~/data/phe_hcv_tf/reads_int/HCV_AVU_AB_1.4.R12.fastq.gz | samtools view -b - | samtools sort -m 4G -o ~/segemehl/phe_hcv_tf/a60_r50_top_cns_man/1.4 -
~/segemehl/segemehl_0_2_0/segemehl.x -t 6 -A 60 -R 50 --maxout 1 -b -d ~/data/phe_hcv_tf/all_cns_man/1.5.cns.fa -x ~/data/phe_hcv_tf/all_cns_man/1.5.cns.idx -q ~/data/phe_hcv_tf/reads_int/HCV_AVU_AB_1.5.R12.fastq.gz | samtools view -b - | samtools sort -m 4G -o ~/segemehl/phe_hcv_tf/a60_r50_top_cns_man/1.5 -
~/segemehl/segemehl_0_2_0/segemehl.x -t 6 -A 60 -R 50 --maxout 1 -b -d ~/data/phe_hcv_tf/all_cns_man/10.1.cns.fa -x ~/data/phe_hcv_tf/all_cns_man/10.1.cns.idx -q ~/data/phe_hcv_tf/reads_int/HCV_AVU_AB_10.1.R12.fastq.gz | samtools view -b - | samtools sort -m 4G -o ~/segemehl/phe_hcv_tf/a60_r50_top_cns_man/10.1 -
~/segemehl/segemehl_0_2_0/segemehl.x -t 6 -A 60 -R 50 --maxout 1 -b -d ~/data/phe_hcv_tf/all_cns_man/10.2.cns.fa -x ~/data/phe_hcv_tf/all_cns_man/10.2.cns.idx -q ~/data/phe_hcv_tf/reads_int/HCV_AVU_AB_10.2.R12.fastq.gz | samtools view -b - | samtools sort -m 4G -o ~/segemehl/phe_hcv_tf/a60_r50_top_cns_man/10.2 -
~/segemehl/segemehl_0_2_0/segemehl.x -t 6 -A 60 -R 50 --maxout 1 -b -d ~/data/phe_hcv_tf/all_cns_man/11.1.cns.fa -x ~/data/phe_hcv_tf/all_cns_man/11.1.cns.idx -q ~/data/phe_hcv_tf/reads_int/HCV_AVU_AB_11.1.R12.fastq.gz | samtools view -b - | samtools sort -m 4G -o ~/segemehl/phe_hcv_tf/a60_r50_top_cns_man/11.1 -
~/segemehl/segemehl_0_2_0/segemehl.x -t 6 -A 60 -R 50 --maxout 1 -b -d ~/data/phe_hcv_tf/all_cns_man/11.2.cns.fa -x ~/data/phe_hcv_tf/all_cns_man/11.2.cns.idx -q ~/data/phe_hcv_tf/reads_int/HCV_AVU_AB_11.2.R12.fastq.gz | samtools view -b - | samtools sort -m 4G -o ~/segemehl/phe_hcv_tf/a60_r50_top_cns_man/11.2 -
~/segemehl/segemehl_0_2_0/segemehl.x -t 6 -A 60 -R 50 --maxout 1 -b -d ~/data/phe_hcv_tf/all_cns_man/12.1.cns.fa -x ~/data/phe_hcv_tf/all_cns_man/12.1.cns.idx -q ~/data/phe_hcv_tf/reads_int/HCV_AVU_AB_12.1.R12.fastq.gz | samtools view -b - | samtools sort -m 4G -o ~/segemehl/phe_hcv_tf/a60_r50_top_cns_man/12.1 -
~/segemehl/segemehl_0_2_0/segemehl.x -t 6 -A 60 -R 50 --maxout 1 -b -d ~/data/phe_hcv_tf/all_cns_man/12.2.cns.fa -x ~/data/phe_hcv_tf/all_cns_man/12.2.cns.idx -q ~/data/phe_hcv_tf/reads_int/HCV_AVU_AB_12.2.R12.fastq.gz | samtools view -b - | samtools sort -m 4G -o ~/segemehl/phe_hcv_tf/a60_r50_top_cns_man/12.2 -
~/segemehl/segemehl_0_2_0/segemehl.x -t 6 -A 60 -R 50 --maxout 1 -b -d ~/data/phe_hcv_tf/all_cns_man/13.1.cns.fa -x ~/data/phe_hcv_tf/all_cns_man/13.1.cns.idx -q ~/data/phe_hcv_tf/reads_int/HCV_AVU_AB_13.1.R12.fastq.gz | samtools view -b - | samtools sort -m 4G -o ~/segemehl/phe_hcv_tf/a60_r50_top_cns_man/13.1 -
~/segemehl/segemehl_0_2_0/segemehl.x -t 6 -A 60 -R 50 --maxout 1 -b -d ~/data/phe_hcv_tf/all_cns_man/13.2.cns.fa -x ~/data/phe_hcv_tf/all_cns_man/13.2.cns.idx -q ~/data/phe_hcv_tf/reads_int/HCV_AVU_AB_13.2.R12.fastq.gz | samtools view -b - | samtools sort -m 4G -o ~/segemehl/phe_hcv_tf/a60_r50_top_cns_man/13.2 -
~/segemehl/segemehl_0_2_0/segemehl.x -t 6 -A 60 -R 50 --maxout 1 -b -d ~/data/phe_hcv_tf/all_cns_man/14.1.cns.fa -x ~/data/phe_hcv_tf/all_cns_man/14.1.cns.idx -q ~/data/phe_hcv_tf/reads_int/HCV_AVU_AB_14.1.R12.fastq.gz | samtools view -b - | samtools sort -m 4G -o ~/segemehl/phe_hcv_tf/a60_r50_top_cns_man/14.1 -
~/segemehl/segemehl_0_2_0/segemehl.x -t 6 -A 60 -R 50 --maxout 1 -b -d ~/data/phe_hcv_tf/all_cns_man/14.2.cns.fa -x ~/data/phe_hcv_tf/all_cns_man/14.2.cns.idx -q ~/data/phe_hcv_tf/reads_int/HCV_AVU_AB_14.2.R12.fastq.gz | samtools view -b - | samtools sort -m 4G -o ~/segemehl/phe_hcv_tf/a60_r50_top_cns_man/14.2 -
~/segemehl/segemehl_0_2_0/segemehl.x -t 6 -A 60 -R 50 --maxout 1 -b -d ~/data/phe_hcv_tf/all_cns_man/2.1.cns.fa -x ~/data/phe_hcv_tf/all_cns_man/2.1.cns.idx -q ~/data/phe_hcv_tf/reads_int/HCV_AVU_AB_2.1.R12.fastq.gz | samtools view -b - | samtools sort -m 4G -o ~/segemehl/phe_hcv_tf/a60_r50_top_cns_man/2.1 -
~/segemehl/segemehl_0_2_0/segemehl.x -t 6 -A 60 -R 50 --maxout 1 -b -d ~/data/phe_hcv_tf/all_cns_man/2.2.cns.fa -x ~/data/phe_hcv_tf/all_cns_man/2.2.cns.idx -q ~/data/phe_hcv_tf/reads_int/HCV_AVU_AB_2.2.R12.fastq.gz | samtools view -b - | samtools sort -m 4G -o ~/segemehl/phe_hcv_tf/a60_r50_top_cns_man/2.2 -
~/segemehl/segemehl_0_2_0/segemehl.x -t 6 -A 60 -R 50 --maxout 1 -b -d ~/data/phe_hcv_tf/all_cns_man/2.3.cns.fa -x ~/data/phe_hcv_tf/all_cns_man/2.3.cns.idx -q ~/data/phe_hcv_tf/reads_int/HCV_AVU_AB_2.3.R12.fastq.gz | samtools view -b - | samtools sort -m 4G -o ~/segemehl/phe_hcv_tf/a60_r50_top_cns_man/2.3 -
~/segemehl/segemehl_0_2_0/segemehl.x -t 6 -A 60 -R 50 --maxout 1 -b -d ~/data/phe_hcv_tf/all_cns_man/2.4.cns.fa -x ~/data/phe_hcv_tf/all_cns_man/2.4.cns.idx -q ~/data/phe_hcv_tf/reads_int/HCV_AVU_AB_2.4.R12.fastq.gz | samtools view -b - | samtools sort -m 4G -o ~/segemehl/phe_hcv_tf/a60_r50_top_cns_man/2.4 -
~/segemehl/segemehl_0_2_0/segemehl.x -t 6 -A 60 -R 50 --maxout 1 -b -d ~/data/phe_hcv_tf/all_cns_man/3.1.cns.fa -x ~/data/phe_hcv_tf/all_cns_man/3.1.cns.idx -q ~/data/phe_hcv_tf/reads_int/HCV_AVU_AB_3.1.R12.fastq.gz | samtools view -b - | samtools sort -m 4G -o ~/segemehl/phe_hcv_tf/a60_r50_top_cns_man/3.1 -
~/segemehl/segemehl_0_2_0/segemehl.x -t 6 -A 60 -R 50 --maxout 1 -b -d ~/data/phe_hcv_tf/all_cns_man/3.2.cns.fa -x ~/data/phe_hcv_tf/all_cns_man/3.2.cns.idx -q ~/data/phe_hcv_tf/reads_int/HCV_AVU_AB_3.2.R12.fastq.gz | samtools view -b - | samtools sort -m 4G -o ~/segemehl/phe_hcv_tf/a60_r50_top_cns_man/3.2 -
~/segemehl/segemehl_0_2_0/segemehl.x -t 6 -A 60 -R 50 --maxout 1 -b -d ~/data/phe_hcv_tf/all_cns_man/3.3.cns.fa -x ~/data/phe_hcv_tf/all_cns_man/3.3.cns.idx -q ~/data/phe_hcv_tf/reads_int/HCV_AVU_AB_3.3.R12.fastq.gz | samtools view -b - | samtools sort -m 4G -o ~/segemehl/phe_hcv_tf/a60_r50_top_cns_man/3.3 -
~/segemehl/segemehl_0_2_0/segemehl.x -t 6 -A 60 -R 50 --maxout 1 -b -d ~/data/phe_hcv_tf/all_cns_man/3.4.cns.fa -x ~/data/phe_hcv_tf/all_cns_man/3.4.cns.idx -q ~/data/phe_hcv_tf/reads_int/HCV_AVU_AB_3.4.R12.fastq.gz | samtools view -b - | samtools sort -m 4G -o ~/segemehl/phe_hcv_tf/a60_r50_top_cns_man/3.4 -
~/segemehl/segemehl_0_2_0/segemehl.x -t 6 -A 60 -R 50 --maxout 1 -b -d ~/data/phe_hcv_tf/all_cns_man/3.5.cns.fa -x ~/data/phe_hcv_tf/all_cns_man/3.5.cns.idx -q ~/data/phe_hcv_tf/reads_int/HCV_AVU_AB_3.5.R12.fastq.gz | samtools view -b - | samtools sort -m 4G -o ~/segemehl/phe_hcv_tf/a60_r50_top_cns_man/3.5 -
~/segemehl/segemehl_0_2_0/segemehl.x -t 6 -A 60 -R 50 --maxout 1 -b -d ~/data/phe_hcv_tf/all_cns_man/4.1.cns.fa -x ~/data/phe_hcv_tf/all_cns_man/4.1.cns.idx -q ~/data/phe_hcv_tf/reads_int/HCV_AVU_AB_4.1.R12.fastq.gz | samtools view -b - | samtools sort -m 4G -o ~/segemehl/phe_hcv_tf/a60_r50_top_cns_man/4.1 -
~/segemehl/segemehl_0_2_0/segemehl.x -t 6 -A 60 -R 50 --maxout 1 -b -d ~/data/phe_hcv_tf/all_cns_man/4.2.cns.fa -x ~/data/phe_hcv_tf/all_cns_man/4.2.cns.idx -q ~/data/phe_hcv_tf/reads_int/HCV_AVU_AB_4.2.R12.fastq.gz | samtools view -b - | samtools sort -m 4G -o ~/segemehl/phe_hcv_tf/a60_r50_top_cns_man/4.2 -
~/segemehl/segemehl_0_2_0/segemehl.x -t 6 -A 60 -R 50 --maxout 1 -b -d ~/data/phe_hcv_tf/all_cns_man/5.1.cns.fa -x ~/data/phe_hcv_tf/all_cns_man/5.1.cns.idx -q ~/data/phe_hcv_tf/reads_int/HCV_AVU_AB_5.1.R12.fastq.gz | samtools view -b - | samtools sort -m 4G -o ~/segemehl/phe_hcv_tf/a60_r50_top_cns_man/5.1 -
~/segemehl/segemehl_0_2_0/segemehl.x -t 6 -A 60 -R 50 --maxout 1 -b -d ~/data/phe_hcv_tf/all_cns_man/5.2.cns.fa -x ~/data/phe_hcv_tf/all_cns_man/5.2.cns.idx -q ~/data/phe_hcv_tf/reads_int/HCV_AVU_AB_5.2.R12.fastq.gz | samtools view -b - | samtools sort -m 4G -o ~/segemehl/phe_hcv_tf/a60_r50_top_cns_man/5.2 -
~/segemehl/segemehl_0_2_0/segemehl.x -t 6 -A 60 -R 50 --maxout 1 -b -d ~/data/phe_hcv_tf/all_cns_man/6.1.cns.fa -x ~/data/phe_hcv_tf/all_cns_man/6.1.cns.idx -q ~/data/phe_hcv_tf/reads_int/HCV_AVU_AB_6.1.R12.fastq.gz | samtools view -b - | samtools sort -m 4G -o ~/segemehl/phe_hcv_tf/a60_r50_top_cns_man/6.1 -
~/segemehl/segemehl_0_2_0/segemehl.x -t 6 -A 60 -R 50 --maxout 1 -b -d ~/data/phe_hcv_tf/all_cns_man/6.2.cns.fa -x ~/data/phe_hcv_tf/all_cns_man/6.2.cns.idx -q ~/data/phe_hcv_tf/reads_int/HCV_AVU_AB_6.2.R12.fastq.gz | samtools view -b - | samtools sort -m 4G -o ~/segemehl/phe_hcv_tf/a60_r50_top_cns_man/6.2 -
~/segemehl/segemehl_0_2_0/segemehl.x -t 6 -A 60 -R 50 --maxout 1 -b -d ~/data/phe_hcv_tf/all_cns_man/7.1.cns.fa -x ~/data/phe_hcv_tf/all_cns_man/7.1.cns.idx -q ~/data/phe_hcv_tf/reads_int/HCV_AVU_AB_7.1.R12.fastq.gz | samtools view -b - | samtools sort -m 4G -o ~/segemehl/phe_hcv_tf/a60_r50_top_cns_man/7.1 -
~/segemehl/segemehl_0_2_0/segemehl.x -t 6 -A 60 -R 50 --maxout 1 -b -d ~/data/phe_hcv_tf/all_cns_man/7.2.cns.fa -x ~/data/phe_hcv_tf/all_cns_man/7.2.cns.idx -q ~/data/phe_hcv_tf/reads_int/HCV_AVU_AB_7.2.R12.fastq.gz | samtools view -b - | samtools sort -m 4G -o ~/segemehl/phe_hcv_tf/a60_r50_top_cns_man/7.2 -
~/segemehl/segemehl_0_2_0/segemehl.x -t 6 -A 60 -R 50 --maxout 1 -b -d ~/data/phe_hcv_tf/all_cns_man/8.1.cns.fa -x ~/data/phe_hcv_tf/all_cns_man/8.1.cns.idx -q ~/data/phe_hcv_tf/reads_int/HCV_AVU_AB_8.1.R12.fastq.gz | samtools view -b - | samtools sort -m 4G -o ~/segemehl/phe_hcv_tf/a60_r50_top_cns_man/8.1 -
~/segemehl/segemehl_0_2_0/segemehl.x -t 6 -A 60 -R 50 --maxout 1 -b -d ~/data/phe_hcv_tf/all_cns_man/8.2.cns.fa -x ~/data/phe_hcv_tf/all_cns_man/8.2.cns.idx -q ~/data/phe_hcv_tf/reads_int/HCV_AVU_AB_8.2.R12.fastq.gz | samtools view -b - | samtools sort -m 4G -o ~/segemehl/phe_hcv_tf/a60_r50_top_cns_man/8.2 -
~/segemehl/segemehl_0_2_0/segemehl.x -t 6 -A 60 -R 50 --maxout 1 -b -d ~/data/phe_hcv_tf/all_cns_man/8.3.cns.fa -x ~/data/phe_hcv_tf/all_cns_man/8.3.cns.idx -q ~/data/phe_hcv_tf/reads_int/HCV_AVU_AB_8.3.R12.fastq.gz | samtools view -b - | samtools sort -m 4G -o ~/segemehl/phe_hcv_tf/a60_r50_top_cns_man/8.3 -
~/segemehl/segemehl_0_2_0/segemehl.x -t 6 -A 60 -R 50 --maxout 1 -b -d ~/data/phe_hcv_tf/all_cns_man/9.1.cns.fa -x ~/data/phe_hcv_tf/all_cns_man/9.1.cns.idx -q ~/data/phe_hcv_tf/reads_int/HCV_AVU_AB_9.1.R12.fastq.gz | samtools view -b - | samtools sort -m 4G -o ~/segemehl/phe_hcv_tf/a60_r50_top_cns_man/9.1 -
~/segemehl/segemehl_0_2_0/segemehl.x -t 6 -A 60 -R 50 --maxout 1 -b -d ~/data/phe_hcv_tf/all_cns_man/9.2.cns.fa -x ~/data/phe_hcv_tf/all_cns_man/9.2.cns.idx -q ~/data/phe_hcv_tf/reads_int/HCV_AVU_AB_9.2.R12.fastq.gz | samtools view -b - | samtools sort -m 4G -o ~/segemehl/phe_hcv_tf/a60_r50_top_cns_man/9.2 -

Subsample mappings with BBTools reformat.sh

In [7]:
cmd = ('/opt/sci/bin/bbmap/reformat.sh'
       ' in={fn}'
       ' out={bam_sub_path}{id}.100k.bam'
       ' samplereadstarget=100000'
       ' mappedonly=t')

for id, fn in ids_bams.items():
    os.system(cmd.format(id=id, fn=fn, bam_sub_path=bam_sub_path))
    print(id)
1.1
1.2
1.3
1.4
1.5
10.1
10.2
11.1
11.2
12.1
12.2
13.1
13.2
14.1
14.2
2.1
2.2
2.3
2.4
3.1
3.2
3.3
3.4
3.5
4.1
4.2
5.1
5.2
6.1
6.2
7.1
7.2
8.1
8.2
8.3
9.1
9.2

kindel weights

Run once on bam or subbed bams, else use load pickled weights dict

In [ ]:
weights = {}
for id, fn in ids_bams_sub.items():
    weights[id] = kindel.weights(fn, relative=True)
    print(id)
loading sequences: 100000it [00:11, 8705.56it/s]
loading sequences: 824it [00:00, 8235.06it/s]
1.1
loading sequences: 100000it [00:11, 8650.36it/s]
loading sequences: 834it [00:00, 8338.68it/s]
1.2
loading sequences: 100000it [00:11, 8569.32it/s]
loading sequences: 843it [00:00, 8428.00it/s]
1.3
loading sequences: 100000it [00:11, 8638.71it/s]
loading sequences: 743it [00:00, 7428.15it/s]
1.4
loading sequences: 100000it [00:11, 8533.74it/s]
loading sequences: 973it [00:00, 9724.17it/s]
1.5
loading sequences: 100000it [00:10, 9463.78it/s]
loading sequences: 991it [00:00, 9899.81it/s]
10.1
loading sequences: 100000it [00:10, 9469.39it/s]
loading sequences: 981it [00:00, 9801.08it/s]
10.2
loading sequences: 100000it [00:10, 9513.23it/s]
loading sequences: 960it [00:00, 9599.23it/s]
11.1
loading sequences: 100000it [00:10, 9526.85it/s]
loading sequences: 980it [00:00, 9792.86it/s]
11.2
loading sequences: 100000it [00:10, 9549.74it/s]
loading sequences: 934it [00:00, 9337.96it/s]
12.1
loading sequences: 100000it [00:10, 9592.12it/s]
loading sequences: 987it [00:00, 9868.13it/s]
12.2
loading sequences: 100000it [00:10, 9758.57it/s]
loading sequences: 973it [00:00, 9727.18it/s]
13.1
loading sequences: 100000it [00:10, 9534.71it/s]
loading sequences: 880it [00:00, 8793.84it/s]
13.2
loading sequences: 100000it [00:10, 9406.51it/s]
loading sequences: 849it [00:00, 8484.91it/s]
14.1
loading sequences: 100000it [00:10, 9486.12it/s]
loading sequences: 833it [00:00, 8327.84it/s]
14.2
loading sequences: 100000it [00:10, 9172.63it/s]
loading sequences: 844it [00:00, 8438.58it/s]
2.1
loading sequences: 100000it [00:11, 9070.81it/s]
loading sequences: 806it [00:00, 8051.54it/s]
2.2
loading sequences: 100000it [00:11, 8823.44it/s]
loading sequences: 785it [00:00, 7843.65it/s]
2.3
loading sequences: 100000it [00:11, 9076.07it/s]
loading sequences: 792it [00:00, 7913.10it/s]
2.4
loading sequences: 100000it [00:11, 8828.18it/s]
loading sequences: 833it [00:00, 8321.93it/s]
3.1
loading sequences: 100000it [00:11, 8803.63it/s]
loading sequences: 821it [00:00, 8198.83it/s]
3.2
loading sequences: 100000it [00:11, 8621.53it/s]
loading sequences: 861it [00:00, 8606.40it/s]
3.3
loading sequences: 100000it [00:11, 8987.33it/s]
loading sequences: 773it [00:00, 7726.75it/s]
3.4
loading sequences: 100000it [00:11, 8495.53it/s]
loading sequences: 825it [00:00, 8245.70it/s]
3.5
loading sequences: 100000it [00:11, 9017.31it/s]
loading sequences: 817it [00:00, 8157.58it/s]
4.1
loading sequences: 100000it [00:11, 9024.69it/s]
loading sequences: 854it [00:00, 8536.91it/s]
4.2
loading sequences: 100000it [00:11, 8888.50it/s]
loading sequences: 812it [00:00, 8118.03it/s]
5.1
loading sequences: 100000it [00:10, 9126.53it/s]
loading sequences: 841it [00:00, 8408.24it/s]
5.2
loading sequences: 100000it [00:11, 8399.57it/s]
loading sequences: 856it [00:00, 8543.06it/s]
6.1
loading sequences: 100000it [00:12, 8324.37it/s]
loading sequences: 800it [00:00, 7993.36it/s]
6.2
loading sequences: 100000it [00:11, 8941.78it/s]
loading sequences: 694it [00:00, 6935.58it/s]
7.1
loading sequences: 100000it [00:11, 8876.98it/s]
loading sequences: 810it [00:00, 8098.73it/s]
7.2
loading sequences: 100000it [00:12, 8118.72it/s]
loading sequences: 746it [00:00, 7453.45it/s]
8.1
loading sequences: 100000it [00:12, 7893.16it/s]
loading sequences: 872it [00:00, 8712.16it/s]
8.2
loading sequences: 100000it [00:11, 8712.26it/s]
loading sequences: 580it [00:00, 5794.49it/s]
8.3
loading sequences: 100000it [00:11, 8562.83it/s]
loading sequences: 920it [00:00, 9192.36it/s]
9.1
loading sequences: 2686it [00:00, 9006.04it/s]
In [ ]:
with open('res/2017-07-04/weights', 'wb') as fh:
    pickle.dump(weights, fh)
In [7]:
with open('res/2017-07-04/weights.p', 'rb') as fh:
    weights = pickle.load(fh)

Group initial and final timepoints

In [8]:
first_weights = {id: wt for id, wt in weights.items() if id.endswith('.1')}
last_ids = [id for id in weights.keys() if id.partition('.')[0] + '.' + str(int(id[-1])+1) not in weights.keys()]
last_weights = {id: wt for id, wt in weights.items() if id in last_ids}

Only positions 342-9377 correspond to polyprotein... restrict sites

https://www.ncbi.nlm.nih.gov/nuccore/NC_004102
Lengths not conformant to H77 coordinates?

Entropy and consensus

In [29]:
# first_weights['1.1']['shannon']
shannons = pd.DataFrame({id: wts['shannon'] for id, wts in weights.items()})
first_shannons = pd.DataFrame({id: weights['shannon'] for id, weights in first_weights.items()})
last_shannons = pd.DataFrame({id: weights['shannon'] for id, weights in last_weights.items()})
first_consensuses = pd.DataFrame({id: weights['consensus'] for id, weights in first_weights.items()})
last_consensuses = pd.DataFrame({id: weights['consensus'] for id, weights in last_weights.items()})

Samplewise median entropy and consensus

In [10]:
pd.DataFrame(dict(first=first_consensuses.median(), last=last_consensuses.median())).plot.bar(logy=True)
Out[10]:
<matplotlib.axes._subplots.AxesSubplot at 0x1144b27b8>
In [9]:
pd.DataFrame(dict(first=first_consensuses.median(), last=last_consensuses.median())).plot.bar(logy=True)
Out[9]:
<matplotlib.axes._subplots.AxesSubplot at 0x111f42c18>

Positionwise median entropy and consensus

In [16]:
pd.DataFrame(dict(first=first_shannons.median(axis=1), last=last_shannons.median(axis=1))).plot()
Out[16]:
<matplotlib.axes._subplots.AxesSubplot at 0x10c6b08d0>
In [51]:
pd.DataFrame(dict(first=first_consensuses.median(axis=1), last=last_consensuses.median(axis=1))).iplot()
In [52]:
pd.DataFrame(dict(first=first_consensuses.median(axis=1), last=last_consensuses.median(axis=1))).iplot()
In [49]:
first_consensuses.median(axis=1).rolling(window=5, center=True).median()
Out[49]:
0          NaN
1          NaN
2       0.9690
3       0.9535
4       0.9440
5       0.9440
6       0.9325
7       0.9440
8       0.9440
9       0.9565
10      0.9530
11      0.9760
12      0.9705
13      0.9715
14      0.9715
15      0.9855
16      0.9825
17      0.9845
18      0.9845
19      0.9825
20      0.9825
21      0.9830
22      0.9795
23      0.9830
24      0.9830
25      0.9795
26      0.9795
27      0.9900
28      0.9900
29      0.9915
         ...  
9566    0.9430
9567    0.9620
9568    0.9430
9569    0.9620
9570    0.8030
9571    0.9620
9572    0.9300
9573    0.9300
9574    0.8030
9575    0.7960
9576    0.7570
9577    0.7570
9578    0.7570
9579    0.7960
9580    0.8850
9581    0.8850
9582    0.8870
9583    0.9480
9584    0.9770
9585    0.9860
9586    0.9910
9587    0.9910
9588    0.9950
9589    0.9950
9590    0.9950
9591    0.9880
9592    0.9840
9593    0.9840
9594       NaN
9595       NaN
Length: 9596, dtype: float64
In [11]:
pd.DataFrame(dict(first=first_shannons[50:9300].median(axis=1).rolling(window=25, center=True).median(), last=last_shannons[50:9300].median(axis=1).rolling(window=25, center=True).median())).iplot(colors=['#835AF1', '#F66095'])
In [12]:
pd.DataFrame(dict(first=first_consensuses[50:9300].median(axis=1).rolling(window=20, center=True).median(), last=last_consensuses[50:9300].median(axis=1).rolling(window=20, center=True).median())).iplot(colors=['#835AF1', '#F66095'])
In [13]:
pd.DataFrame(dict(first=first_consensuses.median(axis=1), last=last_consensuses.median(axis=1))).iplot(kind='histogram', barmode='stack', bins=100, histnorm='probability')
In [14]:
first_shannons.iplot()
In [15]:
first_consensuses.iplot()

Positonwise KDplot

In [18]:
hist_data = [first_shannons_flat, last_shannons_flat]

group_labels = ['Initial', 'Final']

colours = ['#F66095', '#2BCDC1']

# Create distplot with custom bin_size
fig = ff.create_distplot(hist_data, group_labels, colors=colours, bin_size=0.02, show_rug=False)

# Plot!
py.iplot(fig, filename='Distplot with Multiple Datasets')

Diversity KDplot

In [19]:
# Group data together
hist_data = [first_shannons_flat, last_shannons_flat]

group_labels = ['Initial', 'Final']

colours = ['#F66095', '#2BCDC1']

# Create distplot with custom bin_size
fig = ff.create_distplot(hist_data, group_labels, colors=colours, bin_size=0.02, show_rug=False)

# Plot!
py.iplot(fig, filename='Distplot with Multiple Datasets')

Hists

In [20]:
sns.distplot(weights['1.1'].shannon)
Out[20]:
<matplotlib.axes._subplots.AxesSubplot at 0x116000b38>
In [78]:
first_shannons_flat = first_shannons.replace([np.inf, -np.inf], 0).dropna().values.flatten()
last_shannons_flat = last_shannons.replace([np.inf, -np.inf], 0).dropna().values.flatten()
first_consensuses_flat = first_consensuses.replace([np.inf, -np.inf], 0).dropna().values.flatten()
last_consensuses_flat = last_consensuses.replace([np.inf, -np.inf], 0).dropna().values.flatten()
In [22]:
sns.distplot(last_shannons.replace([np.inf, -np.inf], np.nan).dropna().values.flatten())
Out[22]:
<matplotlib.axes._subplots.AxesSubplot at 0x109acba20>
In [23]:
trace1 = go.Histogram(
    x=first_shannons_flat,
    nbinsx=150,
    opacity=0.33,
    marker=dict(color='#ff9f1c'),
    name='Initial timepoints',
    histnorm='probability'
)
trace2 = go.Histogram(
    x=last_shannons_flat,
    nbinsx=150,
    opacity=0.33,
    marker=dict(color='#2ec4b6'),
    name='Final timepoints',
    histnorm='probability'
)

data = [trace1, trace2]
layout = go.Layout(
    barmode='overlay',
    title='Per position median entropy',
    xaxis=dict(range=[0, 1]))
fig = go.Figure(data=data, layout=layout)

py.iplot(fig)
In [27]:
np.median(first_shannons_flat)
Out[27]:
0.051999999999999998
In [28]:
np.median(last_shannons_flat)
Out[28]:
0.049000000000000002
In [24]:
trace1 = go.Histogram(
    x=first_consensuses_flat,
    nbinsx=750,
    opacity=0.33,
    marker=dict(color='#ff9f1c'),
    name='Initial timepoints',
    histnorm='probability'

)
trace2 = go.Histogram(
    x=last_consensuses_flat,
    nbinsx=750,
    opacity=0.33,
    marker=dict(color='#2ec4b6'),
    name='Final timepoints',
    histnorm='probability'
)

data = [trace1, trace2]
layout = go.Layout(
    barmode='overlay',
    title='Per position median consensus support',
    xaxis=dict(range=[0.9, 1]))
fig = go.Figure(data=data, layout=layout)

py.iplot(fig)
In [25]:
np.median(first_consensuses_flat)
Out[25]:
0.99199999999999999
In [26]:
np.median(last_consensuses_flat)
Out[26]:
0.99199999999999999
In [31]:
shannons.median()
Out[31]:
1.1     0.093
1.2     0.086
1.3     0.094
1.4     0.089
1.5     0.098
10.1    0.028
10.2    0.024
11.1    0.029
11.2    0.022
12.1    0.029
12.2    0.029
13.1    0.022
13.2    0.020
14.1    0.026
14.2    0.030
2.1     0.056
2.2     0.061
2.3     0.089
2.4     0.081
3.1     0.066
3.2     0.060
3.3     0.066
3.4     0.055
3.5     0.046
4.1     0.064
4.2     0.077
5.1     0.059
5.2     0.052
6.1     0.099
6.2     0.099
7.1     0.067
7.2     0.008
8.1     0.139
8.2     0.162
8.3     0.069
9.1     0.060
9.2     0.067
dtype: float64

Polyprotein only mappings

In [17]:
bam_path = '/Users/Bede/Desktop/bams/'
bam_sub_path = '/Users/Bede/Desktop/bams_100k/'
In [27]:
bams_fns = [fn for fn in os.listdir(bam_path) if fn.endswith('.bam')]

ids_bams = {}
for fn in bams_fns:
    id = fn.replace('.bam','')
    ids_bams[id] = bam_path + fn
ids_bams
Out[27]:
{'1.1': '/Users/Bede/Desktop/bams/1.1.bam',
 '1.2': '/Users/Bede/Desktop/bams/1.2.bam',
 '1.3': '/Users/Bede/Desktop/bams/1.3.bam',
 '1.4': '/Users/Bede/Desktop/bams/1.4.bam',
 '1.5': '/Users/Bede/Desktop/bams/1.5.bam',
 '10.1': '/Users/Bede/Desktop/bams/10.1.bam',
 '10.2': '/Users/Bede/Desktop/bams/10.2.bam',
 '11.1': '/Users/Bede/Desktop/bams/11.1.bam',
 '11.2': '/Users/Bede/Desktop/bams/11.2.bam',
 '12.1': '/Users/Bede/Desktop/bams/12.1.bam',
 '12.2': '/Users/Bede/Desktop/bams/12.2.bam',
 '13.1': '/Users/Bede/Desktop/bams/13.1.bam',
 '13.2': '/Users/Bede/Desktop/bams/13.2.bam',
 '14.1': '/Users/Bede/Desktop/bams/14.1.bam',
 '14.2': '/Users/Bede/Desktop/bams/14.2.bam',
 '2.1': '/Users/Bede/Desktop/bams/2.1.bam',
 '2.2': '/Users/Bede/Desktop/bams/2.2.bam',
 '2.3': '/Users/Bede/Desktop/bams/2.3.bam',
 '2.4': '/Users/Bede/Desktop/bams/2.4.bam',
 '3.1': '/Users/Bede/Desktop/bams/3.1.bam',
 '3.2': '/Users/Bede/Desktop/bams/3.2.bam',
 '3.3': '/Users/Bede/Desktop/bams/3.3.bam',
 '3.4': '/Users/Bede/Desktop/bams/3.4.bam',
 '3.5': '/Users/Bede/Desktop/bams/3.5.bam',
 '4.1': '/Users/Bede/Desktop/bams/4.1.bam',
 '4.2': '/Users/Bede/Desktop/bams/4.2.bam',
 '5.1': '/Users/Bede/Desktop/bams/5.1.bam',
 '5.2': '/Users/Bede/Desktop/bams/5.2.bam',
 '6.1': '/Users/Bede/Desktop/bams/6.1.bam',
 '6.2': '/Users/Bede/Desktop/bams/6.2.bam',
 '7.1': '/Users/Bede/Desktop/bams/7.1.bam',
 '7.2': '/Users/Bede/Desktop/bams/7.2.bam',
 '8.1': '/Users/Bede/Desktop/bams/8.1.bam',
 '8.2': '/Users/Bede/Desktop/bams/8.2.bam',
 '8.3': '/Users/Bede/Desktop/bams/8.3.bam',
 '9.1': '/Users/Bede/Desktop/bams/9.1.bam',
 '9.2': '/Users/Bede/Desktop/bams/9.2.bam'}

Subsample

In [20]:
cmd = ('/opt/sci/bin/bbmap/reformat.sh'
       ' in={fn}'
       ' out={bam_sub_path}{id}.100k.bam'
       ' samplereadstarget=100000'
       ' mappedonly=t')

for id, fn in ids_bams.items():
    os.system(cmd.format(id=id, fn=fn, bam_sub_path=bam_sub_path))
    print(id)
1.1
1.2
1.3
1.4
1.5
10.1
10.2
11.1
11.2
12.1
12.2
13.1
13.2
14.1
14.2
2.1
2.2
2.3
2.4
3.1
3.2
3.3
3.4
3.5
4.1
4.2
5.1
5.2
6.1
6.2
7.1
7.2
8.1
8.2
8.3
9.1
9.2

kindel plot-clips

In [33]:
ids_bams_sub = {}
for fn in [fn for fn in os.listdir(bam_sub_path) if fn.endswith('.bam')]:
    id = fn.replace('.100k.bam','')
    ids_bams_sub[id] = bam_sub_path + fn
ids_bams_sub
Out[33]:
{'1.1': '/Users/Bede/Desktop/bams_100k/1.1.100k.bam',
 '1.2': '/Users/Bede/Desktop/bams_100k/1.2.100k.bam',
 '1.3': '/Users/Bede/Desktop/bams_100k/1.3.100k.bam',
 '1.4': '/Users/Bede/Desktop/bams_100k/1.4.100k.bam',
 '1.5': '/Users/Bede/Desktop/bams_100k/1.5.100k.bam',
 '10.1': '/Users/Bede/Desktop/bams_100k/10.1.100k.bam',
 '10.2': '/Users/Bede/Desktop/bams_100k/10.2.100k.bam',
 '11.1': '/Users/Bede/Desktop/bams_100k/11.1.100k.bam',
 '11.2': '/Users/Bede/Desktop/bams_100k/11.2.100k.bam',
 '12.1': '/Users/Bede/Desktop/bams_100k/12.1.100k.bam',
 '12.2': '/Users/Bede/Desktop/bams_100k/12.2.100k.bam',
 '13.1': '/Users/Bede/Desktop/bams_100k/13.1.100k.bam',
 '13.2': '/Users/Bede/Desktop/bams_100k/13.2.100k.bam',
 '14.1': '/Users/Bede/Desktop/bams_100k/14.1.100k.bam',
 '14.2': '/Users/Bede/Desktop/bams_100k/14.2.100k.bam',
 '2.1': '/Users/Bede/Desktop/bams_100k/2.1.100k.bam',
 '2.2': '/Users/Bede/Desktop/bams_100k/2.2.100k.bam',
 '2.3': '/Users/Bede/Desktop/bams_100k/2.3.100k.bam',
 '2.4': '/Users/Bede/Desktop/bams_100k/2.4.100k.bam',
 '3.1': '/Users/Bede/Desktop/bams_100k/3.1.100k.bam',
 '3.2': '/Users/Bede/Desktop/bams_100k/3.2.100k.bam',
 '3.3': '/Users/Bede/Desktop/bams_100k/3.3.100k.bam',
 '3.4': '/Users/Bede/Desktop/bams_100k/3.4.100k.bam',
 '3.5': '/Users/Bede/Desktop/bams_100k/3.5.100k.bam',
 '4.1': '/Users/Bede/Desktop/bams_100k/4.1.100k.bam',
 '4.2': '/Users/Bede/Desktop/bams_100k/4.2.100k.bam',
 '5.1': '/Users/Bede/Desktop/bams_100k/5.1.100k.bam',
 '5.2': '/Users/Bede/Desktop/bams_100k/5.2.100k.bam',
 '6.1': '/Users/Bede/Desktop/bams_100k/6.1.100k.bam',
 '6.2': '/Users/Bede/Desktop/bams_100k/6.2.100k.bam',
 '7.1': '/Users/Bede/Desktop/bams_100k/7.1.100k.bam',
 '7.2': '/Users/Bede/Desktop/bams_100k/7.2.100k.bam',
 '8.1': '/Users/Bede/Desktop/bams_100k/8.1.100k.bam',
 '8.2': '/Users/Bede/Desktop/bams_100k/8.2.100k.bam',
 '8.3': '/Users/Bede/Desktop/bams_100k/8.3.100k.bam',
 '9.1': '/Users/Bede/Desktop/bams_100k/9.1.100k.bam',
 '9.2': '/Users/Bede/Desktop/bams_100k/9.2.100k.bam'}
In [34]:
for id, fn in ids_bams_sub.items():
    kindel.plotly_clips(fn)
    print(id)
loading sequences: 100000it [00:10, 9904.40it/s]
loading sequences: 667it [00:00, 6669.51it/s]
1.1
loading sequences: 100000it [00:10, 9557.89it/s]
loading sequences: 642it [00:00, 6417.18it/s]
1.2
loading sequences: 100000it [00:10, 9544.06it/s]
loading sequences: 649it [00:00, 6482.17it/s]
1.3
loading sequences: 100000it [00:10, 9719.06it/s]
loading sequences: 693it [00:00, 6924.31it/s]
1.4
loading sequences: 100000it [00:10, 9822.47it/s]
loading sequences: 754it [00:00, 7536.45it/s]
1.5
loading sequences: 100000it [00:10, 9452.38it/s]
loading sequences: 907it [00:00, 9065.73it/s]
10.1
loading sequences: 100000it [00:10, 9571.92it/s]
loading sequences: 880it [00:00, 8792.96it/s]
10.2
loading sequences: 100000it [00:10, 9591.29it/s]
loading sequences: 779it [00:00, 7789.39it/s]
11.1
loading sequences: 100000it [00:10, 9413.98it/s]
loading sequences: 861it [00:00, 8605.88it/s]
11.2
loading sequences: 100000it [00:10, 9247.05it/s]
loading sequences: 837it [00:00, 8367.71it/s]
12.1
loading sequences: 100000it [00:10, 9403.85it/s]
loading sequences: 669it [00:00, 6689.34it/s]
12.2
loading sequences: 100000it [00:10, 9589.87it/s]
loading sequences: 758it [00:00, 7571.67it/s]
13.1
loading sequences: 100000it [00:10, 9315.76it/s]
loading sequences: 822it [00:00, 8211.55it/s]
13.2
loading sequences: 100000it [00:10, 9371.26it/s]
loading sequences: 742it [00:00, 7414.51it/s]
14.1
loading sequences: 100000it [00:10, 9627.71it/s]
loading sequences: 837it [00:00, 8367.26it/s]
14.2
loading sequences: 100000it [00:10, 9383.82it/s]
loading sequences: 821it [00:00, 8206.78it/s]
2.1
loading sequences: 100000it [00:10, 9379.08it/s]
loading sequences: 669it [00:00, 6689.40it/s]
2.2
loading sequences: 100000it [00:10, 9461.64it/s]
loading sequences: 886it [00:00, 8858.76it/s]
2.3
loading sequences: 100000it [00:10, 9443.18it/s]
loading sequences: 764it [00:00, 7639.64it/s]
2.4
loading sequences: 100000it [00:10, 9330.56it/s]
loading sequences: 655it [00:00, 6548.55it/s]
3.1
loading sequences: 100000it [00:10, 9323.55it/s]
loading sequences: 672it [00:00, 6715.65it/s]
3.2
loading sequences: 100000it [00:10, 9255.47it/s]
loading sequences: 645it [00:00, 6448.07it/s]
3.3
loading sequences: 100000it [00:10, 9310.60it/s]
loading sequences: 642it [00:00, 6412.56it/s]
3.4
loading sequences: 100000it [00:10, 9105.76it/s]
loading sequences: 651it [00:00, 6504.08it/s]
3.5
loading sequences: 100000it [00:10, 9331.15it/s]
loading sequences: 929it [00:00, 9281.29it/s]
4.1
loading sequences: 100000it [00:10, 9403.17it/s]
loading sequences: 654it [00:00, 6535.95it/s]
4.2
loading sequences: 100000it [00:10, 9267.41it/s]
loading sequences: 657it [00:00, 6569.60it/s]
5.1
loading sequences: 100000it [00:10, 9212.73it/s]
loading sequences: 797it [00:00, 7969.53it/s]
5.2
loading sequences: 100000it [00:10, 9432.78it/s]
loading sequences: 787it [00:00, 7865.99it/s]
6.1
loading sequences: 100000it [00:10, 9355.93it/s]
loading sequences: 652it [00:00, 6508.28it/s]
6.2
loading sequences: 100000it [00:10, 9189.47it/s]
loading sequences: 0it [00:00, ?it/s]
7.1
loading sequences: 100000it [00:10, 9101.05it/s]
loading sequences: 793it [00:00, 7921.68it/s]
7.2
loading sequences: 100000it [00:10, 9497.18it/s]
loading sequences: 760it [00:00, 7595.73it/s]
8.1
loading sequences: 100000it [00:10, 9750.73it/s]
loading sequences: 688it [00:00, 6876.65it/s]
8.2
loading sequences: 100000it [00:10, 9148.93it/s]
loading sequences: 652it [00:00, 6512.12it/s]
8.3
loading sequences: 100000it [00:11, 8800.79it/s]
loading sequences: 0it [00:00, ?it/s]
9.1
loading sequences: 100000it [00:10, 9170.41it/s]
9.2

kindel weights

In [37]:
weights = {}
for id, fn in ids_bams_sub.items():
    weights[id] = kindel.weights(fn, relative=True)
    print(id)
loading sequences: 100000it [00:10, 9735.24it/s]
loading sequences: 675it [00:00, 6743.40it/s]
1.1
loading sequences: 100000it [00:10, 9720.86it/s]
loading sequences: 681it [00:00, 6809.37it/s]
1.2
loading sequences: 100000it [00:10, 9714.26it/s]
loading sequences: 657it [00:00, 6561.53it/s]
1.3
loading sequences: 100000it [00:10, 9837.01it/s]
loading sequences: 713it [00:00, 7120.67it/s]
1.4
loading sequences: 100000it [00:10, 9914.81it/s]
loading sequences: 814it [00:00, 8136.19it/s]
1.5
loading sequences: 100000it [00:10, 9454.77it/s]
loading sequences: 900it [00:00, 8995.48it/s]
10.1
loading sequences: 100000it [00:10, 9288.48it/s]
loading sequences: 916it [00:00, 9157.83it/s]
10.2
loading sequences: 100000it [00:10, 9524.89it/s]
loading sequences: 811it [00:00, 8109.66it/s]
11.1
loading sequences: 100000it [00:10, 9474.21it/s]
loading sequences: 921it [00:00, 9208.16it/s]
11.2
loading sequences: 100000it [00:10, 9533.21it/s]
loading sequences: 887it [00:00, 8863.77it/s]
12.1
loading sequences: 100000it [00:10, 9631.99it/s]
loading sequences: 703it [00:00, 7024.66it/s]
12.2
loading sequences: 100000it [00:10, 9740.71it/s]
loading sequences: 808it [00:00, 8079.28it/s]
13.1
loading sequences: 100000it [00:10, 9404.50it/s]
loading sequences: 0it [00:00, ?it/s]
13.2
loading sequences: 100000it [00:10, 9471.72it/s]
loading sequences: 754it [00:00, 7535.41it/s]
14.1
loading sequences: 100000it [00:10, 9573.93it/s]
loading sequences: 869it [00:00, 8689.64it/s]
14.2
loading sequences: 100000it [00:10, 9438.80it/s]
loading sequences: 836it [00:00, 8352.22it/s]
2.1
loading sequences: 100000it [00:10, 9447.36it/s]
loading sequences: 707it [00:00, 7062.43it/s]
2.2
loading sequences: 100000it [00:10, 9626.71it/s]
loading sequences: 896it [00:00, 8957.85it/s]
2.3
loading sequences: 100000it [00:10, 9471.17it/s]
loading sequences: 811it [00:00, 8103.77it/s]
2.4
loading sequences: 100000it [00:10, 9276.25it/s]
loading sequences: 682it [00:00, 6808.21it/s]
3.1
loading sequences: 100000it [00:10, 9394.95it/s]
loading sequences: 680it [00:00, 6796.20it/s]
3.2
loading sequences: 100000it [00:10, 9242.82it/s]
loading sequences: 677it [00:00, 6767.63it/s]
3.3
loading sequences: 100000it [00:10, 9451.54it/s]
loading sequences: 668it [00:00, 6676.60it/s]
3.4
loading sequences: 100000it [00:10, 9315.67it/s]
loading sequences: 728it [00:00, 7277.60it/s]
3.5
loading sequences: 100000it [00:10, 9396.74it/s]
loading sequences: 978it [00:00, 9778.12it/s]
4.1
loading sequences: 100000it [00:10, 9420.94it/s]
loading sequences: 669it [00:00, 6685.65it/s]
4.2
loading sequences: 100000it [00:10, 9252.02it/s]
loading sequences: 666it [00:00, 6656.02it/s]
5.1
loading sequences: 100000it [00:10, 9294.77it/s]
loading sequences: 837it [00:00, 8368.23it/s]
5.2
loading sequences: 100000it [00:10, 9334.86it/s]
loading sequences: 819it [00:00, 8187.29it/s]
6.1
loading sequences: 100000it [00:10, 9537.20it/s]
loading sequences: 672it [00:00, 6711.29it/s]
6.2
loading sequences: 100000it [00:10, 9403.14it/s]
loading sequences: 690it [00:00, 6887.48it/s]
7.1
loading sequences: 100000it [00:10, 9365.14it/s]
loading sequences: 829it [00:00, 8282.52it/s]
7.2
loading sequences: 100000it [00:10, 9773.98it/s]
loading sequences: 802it [00:00, 8014.87it/s]
8.1
loading sequences: 100000it [00:10, 9983.01it/s]
loading sequences: 705it [00:00, 7049.86it/s]
8.2
loading sequences: 100000it [00:10, 9439.62it/s]
loading sequences: 0it [00:00, ?it/s]
8.3
loading sequences: 100000it [00:11, 8828.70it/s]
loading sequences: 731it [00:00, 7308.73it/s]
9.1
loading sequences: 100000it [00:10, 9346.73it/s]
9.2
In [110]:
with open('res/2017-07-04/weights_pp', 'wb') as fh:
    pickle.dump(weights, fh)
In [5]:
with open('res/2017-07-04/weights_pp.p', 'rb') as fh:
    weights = pickle.load(fh)

Data prep

In [6]:
first_weights = {id: wt for id, wt in weights.items() if id.endswith('.1')}
last_ids = [id for id in weights.keys() if id.partition('.')[0] + '.' + str(int(id[-1])+1) not in weights.keys()]
last_weights = {id: wt for id, wt in weights.items() if id in last_ids}
In [7]:
shannons = pd.DataFrame({id: wts['shannon'] for id, wts in weights.items()})
first_shannons = pd.DataFrame({id: weights['shannon'] for id, weights in first_weights.items()}).replace([np.inf, -np.inf], np.nan).replace(np.nan, 0)
last_shannons = pd.DataFrame({id: weights['shannon'] for id, weights in last_weights.items()}).replace([np.inf, -np.inf], np.nan).replace(np.nan, 0)
first_consensuses = pd.DataFrame({id: weights['consensus'] for id, weights in first_weights.items()}).replace([np.inf, -np.inf], np.nan).replace(np.nan, 0)
last_consensuses = pd.DataFrame({id: weights['consensus'] for id, weights in last_weights.items()}).replace([np.inf, -np.inf], np.nan).replace(np.nan, 0)
In [8]:
first_shannons_flat = first_shannons.replace([np.inf, -np.inf], np.nan).dropna().values.flatten()
last_shannons_flat = last_shannons.replace([np.inf, -np.inf], np.nan).dropna().values.flatten()
first_consensuses_flat = first_consensuses.replace([np.inf, -np.inf], np.nan).dropna().values.flatten()
last_consensuses_flat = last_consensuses.replace([np.inf, -np.inf], np.nan).dropna().values.flatten()
In [18]:
len(first_shannons_flat)
Out[18]:
128702

Stat tests on first/last timepoints (not used)

Using coords from https://www.ncbi.nlm.nih.gov/nuccore/NC_004102

Overall

In [55]:
mannwhitneyu(x=first_shannons.values.flatten(), y=last_shannons.values.flatten())
Out[55]:
MannwhitneyuResult(statistic=7613042313.0, pvalue=5.5857653626454451e-236)
In [ ]:
mannwhitneyu(x=first_shannons.values.flatten(), y=last_shannons.values.flatten())

E1

In [43]:
first_shannons[915:1490].values.flatten()
Out[43]:
array([ 0.008,  0.014,  0.   , ...,  0.016,  0.016,  0.014])
In [50]:
mannwhitneyu(x=first_shannons[915:1490].values.flatten(), y=last_shannons[915:1490].values.flatten())
Out[50]:
MannwhitneyuResult(statistic=30424334.0, pvalue=9.2794240258800898e-12)
In [51]:
wilcoxon(x=first_shannons[915:1490].values.flatten(), y=last_shannons[915:1490].values.flatten())
Out[51]:
WilcoxonResult(statistic=13013691.5, pvalue=4.7826033851510409e-07)
In [45]:
e1_fs_d = dict(first_shannons[915:1490].median())
e1_fs_d_norm = {k.partition('.')[0]: v for k, v in e1_fs_d.items()}
e1_fs_d_norm_df = pd.DataFrame.from_dict(e1_fs_d_norm, orient='index')
e1_fs_d_norm_df.columns = ['first']

e1_ls_d = dict(last_shannons[915:1490].median())
e1_ls_d_norm = {k.partition('.')[0]: v for k, v in e1_ls_d.items()}
e1_ls_d_norm_df = pd.DataFrame.from_dict(e1_ls_d_norm, orient='index')
e1_ls_d_norm_df.columns = ['last']

e1_fls_df = pd.concat([e1_fs_d_norm_df, e1_ls_d_norm_df], axis=1)
e1_fls_df
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-45-9b93c977ce54> in <module>()
----> 1 e1_fs_d = dict(first_shannons[915:1490].values.flatten())
      2 e1_fs_d_norm = {k.partition('.')[0]: v for k, v in e1_fs_d.items()}
      3 e1_fs_d_norm_df = pd.DataFrame.from_dict(e1_fs_d_norm, orient='index')
      4 e1_fs_d_norm_df.columns = ['first']
      5 

TypeError: cannot convert dictionary update sequence element #0 to a sequence
In [38]:
wilcoxon(x=e1_fls_df['first'].values, y=e1_fls_df['last'].values)
Out[38]:
WilcoxonResult(statistic=36.0, pvalue=0.30028987962532416)
In [27]:
last_shannons[915:1490].median()
Out[27]:
1.5     0.029
10.2    0.022
11.2    0.016
12.2    0.020
13.2    0.014
14.2    0.022
2.4     0.031
3.5     0.029
4.2     0.073
5.2     0.042
6.2     0.016
7.2     0.022
8.3     0.022
9.2     0.024
dtype: float64





In [11]:
weights['14.1'].shannon.plot()
Out[11]:
<matplotlib.axes._subplots.AxesSubplot at 0x1123af2e8>
In [14]:
dict(shannons.median())
Out[14]:
{'1.1': 0.028000000000000001,
 '1.2': 0.025999999999999999,
 '1.3': 0.028000000000000001,
 '1.4': 0.028000000000000001,
 '1.5': 0.021999999999999999,
 '10.1': 0.021999999999999999,
 '10.2': 0.016,
 '11.1': 0.021999999999999999,
 '11.2': 0.014,
 '12.1': 0.021999999999999999,
 '12.2': 0.02,
 '13.1': 0.016,
 '13.2': 0.014,
 '14.1': 0.016,
 '14.2': 0.02,
 '2.1': 0.021999999999999999,
 '2.2': 0.025999999999999999,
 '2.3': 0.028000000000000001,
 '2.4': 0.029999999999999999,
 '3.1': 0.029000000000000001,
 '3.2': 0.028000000000000001,
 '3.3': 0.028000000000000001,
 '3.4': 0.024,
 '3.5': 0.014,
 '4.1': 0.025999999999999999,
 '4.2': 0.035000000000000003,
 '5.1': 0.021999999999999999,
 '5.2': 0.028000000000000001,
 '6.1': 0.024,
 '6.2': 0.021999999999999999,
 '7.1': 0.02,
 '7.2': 0.0080000000000000002,
 '8.1': 0.029999999999999999,
 '8.2': 0.029000000000000001,
 '8.3': 0.02,
 '9.1': 0.029999999999999999,
 '9.2': 0.025999999999999999}
In [49]:
pd.DataFrame(dict(first=first_shannons.median(axis=1), last=last_shannons.median(axis=1))).plot()
Out[49]:
<matplotlib.axes._subplots.AxesSubplot at 0x11aa6b7f0>
In [56]:
# Group data together
hist_data = [first_shannons_flat, last_shannons_flat]

group_labels = ['Initial', 'Final']

colours = ['#F66095', '#2BCDC1']

# Create distplot with custom bin_size
fig = ff.create_distplot(hist_data, group_labels, colors=colours, bin_size=0.02, show_rug=False)

# Plot!
py.iplot(fig, filename='Distplot with Multiple Datasets')
In [67]:
pd.DataFrame(dict(first=first_consensuses.median(axis=1), last=last_consensuses.median(axis=1))).iplot(kind='histogram', barmode='stack', bins=100, histnorm='probability')
In [68]:
pd.DataFrame(dict(first=first_consensuses.median(axis=1), last=last_consensuses.median(axis=1))).iplot()
In [89]:
pd.DataFrame(dict(first=first_shannons.median(axis=1).rolling(window=25, center=True).median(), last=last_shannons.median(axis=1).rolling(window=25, center=True).median())).iplot(colors=['#835AF1', '#F66095'])
In [94]:
type(first_shannons_flat)
Out[94]:
numpy.ndarray

last_shannons.median(axis=1).rolling(window=25, center=True).mean()

In [73]:
first_shannons.rolling(window=25, center=True).median()['1.1'].plot()
Out[73]:
<matplotlib.axes._subplots.AxesSubplot at 0x11f03e160>
In [74]:
first_shannons.rolling(window=25, center=True).mean()['1.1'].plot()
Out[74]:
<matplotlib.axes._subplots.AxesSubplot at 0x1199f6c50>

Take two

In [245]:
pd.DataFrame(dict(first=first_shannons.median(axis=1), last=last_shannons.median(axis=1))).iplot(colors=['#835AF1', '#F66095'])
In [246]:
pd.DataFrame(dict(initial_timepoints_entropy=first_shannons[:9025].median(axis=1).rolling(window=66, center=True).mean(), final_timepoints_entropy=last_shannons[:9025].median(axis=1).rolling(window=75, center=True).mean())).iplot(colors=['#835AF1', '#F66095'], legend=dict(x=0.78,y=1, bgcolor='rgba(0,0,0,0)'))
In [222]:
pd.DataFrame(dict(initial_timepoints_consensus=first_consensuses.median(axis=1)[:9025].rolling(window=66, center=True).mean(), final_timepoints_consensus=last_consensuses[:9025].median(axis=1).rolling(window=66, center=True).mean())).iplot(colors=['#3D9970', '#FF851B'], legend=dict(x=0.76,y=0, bgcolor='rgba(0,0,0,0)'))
In [247]:
matplotlib.style.use('seaborn')
pd.DataFrame(dict(initial_timepoints_entropy=first_shannons[:9025].median(axis=1).rolling(window=66, center=True).mean(), final_timepoints_entropy=last_shannons[:9025].median(axis=1).rolling(window=75, center=True).mean())).plot()
Out[247]:
<matplotlib.axes._subplots.AxesSubplot at 0x1250f0eb8>
In [227]:
matplotlib.style.available
Out[227]:
['bmh',
 'classic',
 'dark_background',
 'fivethirtyeight',
 'ggplot',
 'grayscale',
 'seaborn-bright',
 'seaborn-colorblind',
 'seaborn-dark-palette',
 'seaborn-dark',
 'seaborn-darkgrid',
 'seaborn-deep',
 'seaborn-muted',
 'seaborn-notebook',
 'seaborn-paper',
 'seaborn-pastel',
 'seaborn-poster',
 'seaborn-talk',
 'seaborn-ticks',
 'seaborn-white',
 'seaborn-whitegrid',
 'seaborn']
In [236]:
def sinplot(flip=1):
    x = np.linspace(0, 14, 100)
    for i in range(1, 7):
        plt.plot(x, np.sin(x + i * .5) * (7 - i) * flip)
sns.set_style("white")
sns.set_style("ticks")

sinplot()
sns.despine()

Seaborn

In [250]:
fig, ax = plt.subplots(figsize=(10, 2))
pd.DataFrame(dict(initial_timepoints_entropy=first_shannons[:9025].median(axis=1).rolling(window=66, center=True).mean(), final_timepoints_entropy=last_shannons[:9025].median(axis=1).rolling(window=75, center=True).mean())).plot(ax=ax)
Out[250]:
<matplotlib.axes._subplots.AxesSubplot at 0x127c80160>
In [248]:
fig, ax = plt.subplots(figsize=(10, 2))
pd.DataFrame(dict(initial_timepoints_consensus=first_consensuses.median(axis=1)[:9025].rolling(window=66, center=True).mean(), final_timepoints_consensus=last_consensuses[:9025].median(axis=1).rolling(window=66, center=True).mean())).plot(ax=ax)
Out[248]:
<matplotlib.axes._subplots.AxesSubplot at 0x129742550>
In [440]:
f, (ax1, ax2, ax3) = plt.subplots(3, figsize=(10, 6), sharex=True)
# sns.set_palette(sns.hls_palette(2, l=.6, s=.6))
sns.set_palette(sns.xkcd_palette(["windows blue", "dark mint"]))
pd.DataFrame(dict(initial_timepoints_entropy=first_shannons[:9025].median(axis=1).rolling(window=66, center=True).mean(), final_timepoints_entropy=last_shannons[:9025].median(axis=1).rolling(window=75, center=True).mean())).plot(ax=ax1)
sns.set_palette(sns.xkcd_palette(["windows blue", "dark mint"]))
pd.DataFrame(dict(initial_timepoints_consensus=first_consensuses.median(axis=1)[:9025].rolling(window=66, center=True).mean(), final_timepoints_consensus=last_consensuses[:9025].median(axis=1).rolling(window=66, center=True).mean())).plot(ax=ax2)

def patch(start, end, colour):
    return patches.Rectangle((start, 0), end, 1, facecolor=colour)
coords_aa = [1, 192, 384, 747, 810, 1027, 1658, 1712, 1973, 2421, 3012]
coords_nt = [pos*3 for pos in coords_aa] 
coords_fmt = []
for i, pos in enumerate(coords_nt):
    if i == 0:
        coords_fmt.append((0, pos))
    else:
        coords_fmt.append((coords_nt[i-1], pos))
genes = [patch(s, e, col) for (s, e), col in zip(coords_fmt, palette)]

for gene in genes:
    ax3.add_patch(gene)

ax1.set_ylabel('Median entropy')    
ax2.set_ylabel('Median consensus support')
ax3.set_xlabel('HCV polyprotein position (nt)')

ax3.set(yticks=[])

ax3.annotate('C', xy=(0, 0), xytext=(220, 0.5))
ax3.annotate('E1', xy=(0, 0), xytext=(794, 0.5))
ax3.annotate('E2', xy=(0, 0), xytext=(1626, 0.5))
ax3.annotate('P7', xy=(0, 0), xytext=(2250, 0.5))
ax3.annotate('NS2', xy=(0, 0), xytext=(2600, 0.5))
ax3.annotate('NS3', xy=(0, 0), xytext=(3820, 0.5))
ax3.annotate('NS4A', xy=(0, 0), xytext=(4800, 0.5))
ax3.annotate('NS4B', xy=(0, 0), xytext=(5330, 0.5))
ax3.annotate('NS5A', xy=(0, 0), xytext=(6400, 0.5))
ax3.annotate('NS5B', xy=(0, 0), xytext=(7900, 0.5))
sns.set_style('white')
plt.savefig("first_last.svg")
In [ ]:
ax3.se
In [372]:
ax3.annotate('C', xy=(1, 1),xycoords='data',
            xytext=(0.1, 0.5), textcoords='axes fraction',
            arrowprops=dict(facecolor='black', shrink=0.05),
            horizontalalignment='right', verticalalignment='top',
            )
Out[372]:
<matplotlib.text.Annotation at 0x12af54f98>
In [326]:
coords_fmt
Out[326]:
[(0, 3),
 (3, 576),
 (576, 1152),
 (1152, 2241),
 (2241, 2430),
 (2430, 3081),
 (3081, 4974),
 (4974, 5136),
 (5136, 5919),
 (5919, 7263),
 (7263, 9036)]
In [392]:
[int(pos[0]-70+(pos[1]-pos[0])/2) for pos in coords_fmt]
Out[392]:
[-68, 219, 794, 1626, 2265, 2685, 3957, 4985, 5457, 6521, 8079]
In [356]:
sns.palplot(sns.hls_palette(6, l=.6, s=.6))
In [ ]:
sns.palplot(sns.hls_palette(6, l=.6, s=.6))
In [415]:
sns.palplot(sns.hls_palette(4, l=.6, s=.6))
In [429]:
sns.palplot(list(reversed(sns.hls_palette(2, l=.6, s=.6))))
In [405]:
sns.palplot(sns.hls_palette(4, l=.6, s=.6))
In [437]:
coords_fmt
Out[437]:
[(0, 3),
 (3, 576),
 (576, 1152),
 (1152, 2241),
 (2241, 2430),
 (2430, 3081),
 (3081, 4974),
 (4974, 5136),
 (5136, 5919),
 (5919, 7263),
 (7263, 9036)]